/*******************************************************************************
  ARTICLE	GAY, GOBBI, GONI (2025) "REVOLUTIONARY TRANSITIONS. INHERITANCE    
            CHANGE AND FERTILITY DECLINE" JOURNAL OF POLITICAL ECONOMY         
                                                                               
  AUTHORS	VICTOR GAY, PAULA GOBBI, MARC GONI                                 
  CONTACT	victor.gay@tse-fr.eu; paula.eugenia.gobbi@ulb.be; marc.goni@uib.no 
  VERSION	1.0 (MAY 2025)                                                     
  SOFTWARE	STATA SE 18                                                        
  LICENCE	MIT                                                                
--------------------------------------------------------------------------------

GENI DATA PREPARATION DO FILE - FINAL DATASET PREPARATION

This file combines the Geni sample, data on inheritance customs and controls, and generates variables for analysis.

Instructions: 
-------------
	Gain access to the Geni database from MyHeritage, Ltd and either:
	
	1) place the following files:

		geni_profiles.csv (20.2 GB, created on April 4, 2022, at 11:39:31),
		geni_unions.csv (7.2 GB, created on April 4, 2022, at 11:55:59),
		geni_union details.csv (2.6 GB, created on April 4, 2022, at 11:54:34).
	
	in folder /1_raw_data/1_1_henri/ (see README for more details) and run R-codes 
	named 1-* to 8-* in folder "/2_scripts/2_1_data/01_geni_data_to_sample"; or
	
	2) place the author-provided fr-clean.csv file into the folder 3_outputs/3_1_datasets.
 
	After 1) or 2), run do-file 02_geni-inheritance-and-controls.do
 
	Open each do-file from the directory in which it is stored (all paths are relative to it), run the do-files in order, and run each one in full.

Do-file structure: 
------------------
	0. PROGRAM SETUP
	1. GEOLOCATIONS AND CONTROLS
	2. GENI
	3. VARIABLES
	
Main sources: 
-------------
	Geni database - online genealogies in geni.com, a MyHeritage Company
	Inheritance customs (own collection)
	Geolocated communes Henry (own collection)
		 
Other sources: 
--------------
	Wheat prices (Ridolfi 2019)
	Population density (Cristofoli et al., 2021; IGN, 2021)
	Administrative centers (Nordman, Ozouf-Marignier, and Laclau, 1989 pp. 74–80)
	Political societies (Boutier, Boutry, and Bonin, 1992, pp. 77–101)
	Cassini road (Perret, Gribaudi, and Barthelemy, 2015)
	Rebellions (Nicolas 2002; Gay 2025)
	Horse-post network (Albertus and Gay, 2025)
	Caloric suitability (Galor and Ozak, 2016)
	Ruggedness (Nunn and Puga, 2012)
	Soil texture (INRA, 1998)
	Refractory clergy in 1791 (Tackett 1984)
		 
*/
********************************************************************************

********************
* 0. PROGRAM SETUP *
********************

* Interpret everything below under Stata 18 rules.
version 18

* Disable output paging and start from an empty session.
set more off
clear all

* Timer 1 brackets the whole do-file; it is stopped and listed at the very end.
timer on 1

* ------------------------------------------------------------------------------
* 1. GEOLOCATIONS AND CONTROLS
* ------------------------------------------------------------------------------
* do "data_master.do"
* Load the geolocations-and-controls dataset into an empty workspace.
clear
use "../2_0_tempfiles/geni_controls.dta"

* Keep a copy of the identifier as loaded, then store the merge key as str100.
generate profile_id0 = profile_id
recast str100 profile_id

* Working copy used as the merge partner in section 2 (erased there after use).
save "../2_0_tempfiles/geni_controls-modified.dta", replace

* ------------------------------------------------------------------------------
* 2. GENI
* ------------------------------------------------------------------------------
* Read the cleaned Geni sample; columns 1, 3, 5, 6 and 7 are forced to string.
import delimited "../../3_outputs/3_1_datasets/fr-clean.csv", ///
	clear delimiter(comma) bindquote(strict) encoding(UTF-8) ///
	stringcols(1 3 5 6 7)

* Restrict the import to the fields used below.
keep profile_id ///
	 flag_2punion punion flag_4munion munion1 munion2 munion3 ///
	 first_name middle_name last_name maiden_name occupation gender ///
	 birth_range birth_start_year birth_start_circa ///
	 marriage_range marriage_start_year marriage_start_circa has_spouse ///
	 death_range death_start_year death_start_circa ///
	 marriage_age_rough bad_marriage_age death_age_rough bad_death_age ///
	 birth_order birth_order_s ///
	 twoplus_flag nna nkids d5

* Attach geolocations and controls, one row per profile.
merge 1:1 profile_id using "../2_0_tempfiles/geni_controls-modified.dta"

* Retain only matched profiles that carry both coordinates.
drop if _merge!=3 | latitude==. | longitude==.

* The working copy of the controls file is no longer needed.
erase "../2_0_tempfiles/geni_controls-modified.dta"
* Labels for identifiers and controls carried over from the merge.
label var profile_id "Profile id for persons in Geni.com"
label var twoplus_flag "Horizontal sample [Blanc 2024] (1 = one of four preceding gen. has >1 offspring)"
label var gender "Gender (f=female, m=male)"
label var birth_start_circa "Birth year approximated (t=yes, f=no)"
label var insee_com "Municipality INSEE code"
* The full variable name is spelled out (rather than the pop_densit~1793
* abbreviation) so the command cannot become ambiguous and matches the name
* used when the log density is built further down.
label var pop_density_1793 "Population density in 1793 (per km2) [Cristofoli et al. 2021, GEOFLA 2011]"

* ------------------------------------------------------------------------------
* 3. VARIABLES
* ------------------------------------------------------------------------------

* birth year: plain copy of the Geni birth_start_year field
generate byear = birth_start_year
label variable byear "Birth year"

* fertility variable: completed fertility of mothers, all (nkids), net of
* infant deaths before age 5 (d5)
generate nfert = nkids - d5
label variable nfert "Completed fertility of mothers (births surv age 6)"

* exposure: years fertile post-reform
* The fertile window runs from age 15 (sage) to age 40 (fage). T counts how many
* years of that window fall after the 1793 reform:
*   - 0 for cohorts aged 40 or more in 1793,
*   - 40 minus age in 1793 for cohorts aged 15 to 40 in 1793,
*   - 25 (the whole window, tage) for cohorts younger than 15 in 1793,
*   - missing when the birth year is missing.
* For integer birth years the clamp below gives exactly the values the
* step-by-step replace loop used to produce.
local reform = 1793
local fage = 40
local sage = 15
local tage = `fage' - `sage'
gen T = min(max(`fage' - (`reform' - byear), 0), `tage') if !missing(byear)
label var T "Years fertile post reform"

* Stata ranks missing values above every number, so an unguarded (T>0) would
* set the indicator to 1 for profiles with unknown birth year. The guard keeps
* the indicator missing whenever T is missing.
gen T01 = (T>0) if !missing(T)
label var T01 "=1 if cohort fertile post reform"

* reform treatments: shorten the two gender-rule variable names in one step
rename (womenexc fem_included) (womexc wominc)

label variable affected "Affected by 1793 inheritance reforms [Gay, Gobbi, Goñi 2023]"
label variable impart "Impartible inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"
label variable partible "Partible inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"
label variable womexc "Women excluded from inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"
label variable wominc "Women included in inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"

* interaction: treated area x cohort fertile after the reform
generate T01xaffected = affected*T01
label variable T01xaffected "Reformed inheritance x fertile post-reform"

* Signed distance to inheritance borders: positive where affected==1,
* negative where affected==0, missing for any other value of affected.
generate dista = cond(affected==1, dist_affected, -dist_affected) if inlist(affected, 0, 1)
label variable dista "Distance to inheritance border (km) [Gay, Gobbi, Goñi 2023]"

* Squared signed distance.
generate dista_sq = dista^2
label variable dista_sq "Distance^2 to inheritance border (km) [Gay, Gobbi, Goñi 2023]"

* Wheat prices at age fertile cycle starts
* year15 is the calendar year in which the person turned 15.
generate year15 = byear + 15

* Pick the decadal price series pwDDDD whose decade contains year15.
* Years before 1710 are left missing (pw1700 is not used); years from 1790
* onwards all receive the 1790 price.
generate pwheat_age15 = .
forvalues dec = 1710(10)1790 {
	replace pwheat_age15 = pw`dec' if inrange(year15, `dec', `dec'+9)
}
replace pwheat_age15 = pw1790 if year15>=1790 & year15!=.
label variable pwheat_age15 "Wheat price in decade when women turned 15"

generate logpwheat = ln(pwheat_age15)
label variable logpwheat "Wheat price in decade when women turned 15 (in logs)"

* population density in logs (natural logarithm of the 1793 density)
generate ldensity_1793 = ln(pop_density_1793)
label variable ldensity_1793 "Log population density in 1793"

* use indicators instead of distances as explained in paper
* Each indicator is 1 when the matching dist_ variable is below the cutoff
* (15 for the point features, 7.5 for Cassini roads) and 0 otherwise.
* NOTE: a missing distance is not below the cutoff, so it is coded 0, not missing.
foreach place in eveche socpol rebellion bailliage subdeleg recette post {
	generate near_`place' = (dist_`place' < 15)
}
generate near_cassini = (dist_cassini < 7.5)

label variable near_eveche "=1 if évêché capital within 15km [Nordman et al. 1989]"
label variable near_socpol "=1 if political society within 15km [Boutier et al. 1992]"
label variable near_rebellion "=1 if rebellion against the state in 1780s within 15km [Gay 2025]"
label variable near_bailliage "=1 if bailliage capital within 15km [Nordman et al. 1989]"
label variable near_subdeleg "=1 if subdélégation capital within 15km [Nordman et al. 1989]"
label variable near_recette "=1 if recette des finances capital within 15km [Nordman et al. 1989]"
label variable near_post "=1 if horse-post relay in 1790 within 15km [Albertus and Gay 2025]"
label variable near_cassini "=1 if Cassini road in 1750-90 within 7.5km [Perret et al. 2015]"

* Bailliage ids format
* Left-pad short ids with zeros to three characters:
* ids 1 to 9 become 001 to 009, and ids 10 to 99 become 010 to 099.
* Only strings made of a non-zero digit, optionally followed by one more
* digit, are touched; every other id is left exactly as it is.
quietly {
	* single digit 1-9: prepend two zeros
	replace bailliage_id = "00" + bailliage_id if regexm(bailliage_id, "^[1-9]$")
	* two digits 10-99: prepend one zero (ids padded above are now three
	* characters long and no longer match)
	replace bailliage_id = "0" + bailliage_id if regexm(bailliage_id, "^[1-9][0-9]$")
}

* Housekeeping
* Drop raw Geni fields, intermediate variables and controls not kept in the
* final dataset. pw1780 stays because it is absent from this list.
drop flag_2punion punion flag_4munion munion1 munion2 munion3 ///
	marriage_range marriage_start_year marriage_start_circa has_spouse ///
	first_name middle_name last_name maiden_name occupation ///
	birth_range birth_start_year ///
	death_range death_start_year death_start_circa ///
	death_age_rough bad_death_age marriage_age_rough bad_marriage_age ///
	birth_order birth_order_s nkids nna ///
	insee_com1 nom_comm longitude_rgf93 pop_density_1800 latitude_rgf93 ///
	pw1700 pw1710 pw1720 pw1730 pw1740 pw1750 pw1760 pw1770 pw1790 ///
	pop_1800 area bailliage_name ///
	texture sandy texture_buffer sh_sandy_buffer ruggedness_buffer avcalmean ///
	profile_id0 _merge dist_affected year15
* d5 is not dropped: its entry is commented out.
*d5

* Column order of the final dataset.
order profile_id twoplus_flag gender birth_start_circa insee_com byear nfert ///
	affected partible impart wominc womexc dista ///
	segment_affected_50 segment_affected_100 ///
	segment_affected_50_length segment_affected_100_length ///
	bailliage_id latitude longitude ///
	T T01 T01xaffected ///
	dist_socpol near_socpol dist_eveche near_eveche ///
	dist_recette near_recette dist_subdeleg near_subdeleg ///
	dist_bailliage near_bailliage dist_rebellion near_rebellion ///
	dist_cassini near_cassini dist_post near_post ///
	cl_peril_i pw1780 pwheat_age15 logpwheat ///
	pop_1793 pop_density_1793 ldensity_1793 ///
	sh_sandy ruggedness rural_henry

* Write the final analysis dataset, overwriting any earlier version.
save "../../3_outputs/3_1_datasets/final-geni.dta", replace

* Stop timer 1 and report the elapsed time.
* (The original note records 100 seconds; presumably the authors' run time.)
timer off 1
timer list
